Decision Trees
Udacity: Intro to Machine Learning
Chapter 4
Note: since the required Udacity helper modules (class_vis, prep_terrain_data, email_preprocess) are not available here, the code below is shown as snippets rather than executed in cells.
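If those helpers are missing entirely, a rough stand-in for makeTerrainData (a sketch only; it mimics the course's synthetic terrain data with two features in [0, 1] and a binary speed label, split 75/25 into train/test, but is not the course implementation) lets the snippets run:
import random
def makeTerrainData(n_points=1000):
    # Hypothetical substitute for prep_terrain_data.makeTerrainData:
    # features are (grade, bumpiness); the label is 0 (fast) or 1 (slow).
    random.seed(42)
    grade = [random.random() for _ in range(n_points)]
    bumpy = [random.random() for _ in range(n_points)]
    error = [random.random() for _ in range(n_points)]
    y = [round(g * b + 0.3 + 0.1 * e) for g, b, e in zip(grade, bumpy, error)]
    X = [[g, b] for g, b in zip(grade, bumpy)]
    split = int(0.75 * n_points)
    return X[:split], y[:split], X[split:], y[split:]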
Finding Accuracy:
import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn import tree
from sklearn.metrics import accuracy_score
import numpy as np
import pylab as pl
features_train, labels_train, features_test, labels_test = makeTerrainData()
#################################################################################
########################## DECISION TREE #################################
def classify(features_train, labels_train):
    ### return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X, Y)
    return clf
clf = classify(features_train, labels_train)
# ask the classifier to predict on the test set
labels_pred = clf.predict(features_test)
### compute the accuracy on the test set
acc = accuracy_score(labels_test, labels_pred)
def submitAccuracies():
    return {"acc": round(acc, 3)}
Output: {"message": "{'acc': 0.908}"}
Minimum Samples (2 and 50):
import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from sklearn.metrics import accuracy_score
features_train, labels_train, features_test, labels_test = makeTerrainData()
########################## DECISION TREE #################################
### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
def classify(features_train, labels_train, min_samples=2):
    ### return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)
    clf = clf.fit(X, Y)
    return clf
clf_2 = classify(features_train, labels_train, 2)
clf_50 = classify(features_train, labels_train, 50)
labels_pred_2 = clf_2.predict(features_test)
labels_pred_50 = clf_50.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, labels_pred_2)
acc_min_samples_split_50 = accuracy_score(labels_test, labels_pred_50)
def submitAccuracies():
    return {"acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
            "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)}
Output: {"message": "{'acc_min_samples_split_50': 0.912, 'acc_min_samples_split_2': 0.908}"}
Entropy
from IPython.display import YouTubeVideo
# Entropy case https://youtu.be/Bd15qhUrKCI
YouTubeVideo('Bd15qhUrKCI')
# Entropy case part 2 https://youtu.be/L6J6BRFgDiI
YouTubeVideo('L6J6BRFgDiI')
Entropy calculation for the case below: four samples 'ssff', two slow (s) and two fast (f).
from math import log
parent_entropy = -(0.5*log(0.5,2) + 0.5*log(0.5,2)) # log base 2, so 2nd argument
parent_entropy # maximally impure
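The same arithmetic generalizes to any list of class labels; a small helper (hypothetical, not part of the course code) computes -sum(p * log2(p)) over the class proportions:
from math import log
def entropy(labels):
    # Proportion of samples in each class, then -sum(p * log2(p)).
    n = len(labels)
    return -sum(labels.count(c) / n * log(labels.count(c) / n, 2)
                for c in set(labels))
entropy(list('ssff'))  # 1.0, matching parent_entropy above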
Information gain - based on grade
child_entropy_1 = -(2/3)*log(2/3,2) - (1/3)*log(1/3,2) # child 1 : ssf
child_entropy_1
child_entropy_2 = 0 # child 2 : f, all one class, so entropy is 0
information_gain = parent_entropy - ( (3/4)*child_entropy_1 + (1/4)*child_entropy_2 )
information_gain # ~0.311
Information gain - based on bumpiness
from graphviz import Digraph
g = Digraph()
g.node('A','ssff',xlabel='P')
g.node('B','sf',xlabel='C1')
g.node('C','sf',xlabel='C2')
g.edge('A','B',label='bumpy')
g.edge('A','C',label='smooth')
g
# Parent P : ssff
entropy_P = -(0.5*log(0.5,2) + 0.5*log(0.5,2))
# Child C1 : sf
entropy_C1 = -(0.5*log(0.5,2) + 0.5*log(0.5,2))
# Child C2 : sf
entropy_C2 = -(0.5*log(0.5,2) + 0.5*log(0.5,2))
information_gain = entropy_P - ( (2/4)*entropy_C1 + (2/4)*entropy_C2 )
information_gain # 0.0: each child keeps the parent's 50/50 mix, so the split gains nothing
Information gain - based on speed limit
from graphviz import Digraph
g = Digraph()
g.node('A','ssff',xlabel='P')
g.node('B','ss',xlabel='C1')
g.node('C','ff',xlabel='C2')
g.edge('A','B',label='Speed limit yes')
g.edge('A','C',label='Speed limit no')
g
# Parent P : ssff
entropy_P = -(0.5*log(0.5,2) + 0.5*log(0.5,2))
# Child C1 : ss
entropy_C1 = -(1*log(1,2))
# Child C2 : ff
entropy_C2 = -(1*log(1,2))
information_gain = entropy_P - ( (2/4)*entropy_C1 + (2/4)*entropy_C2 )
information_gain # 1.0: the split separates the classes perfectly
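Putting the three worked examples together: information gain is the parent entropy minus the size-weighted child entropies. A generic helper (hypothetical, building on the entropy function sketched earlier) reproduces all three results:
def information_gain(parent, children):
    # Parent impurity minus the size-weighted average impurity of the children.
    n = len(parent)
    weighted = sum(len(c) / n * entropy(c) for c in children)
    return entropy(parent) - weighted
print(information_gain(list('ssff'), [list('ssf'), list('f')]))  # grade: ~0.311
print(information_gain(list('ssff'), [list('sf'), list('sf')]))  # bumpiness: 0.0
print(information_gain(list('ssff'), [list('ss'), list('ff')]))  # speed limit: 1.0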
#!/usr/bin/python
"""
This is the code to accompany the Lesson 2 (SVM) mini-project.
Use a SVM to identify emails from the Enron corpus by their authors:
Sara has label 0
Chris has label 1
"""
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn import tree
from sklearn.metrics import accuracy_score
### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess() # default feature-selection percentile is 10
#########################################################
### your code goes here ###
def classify(features_train, labels_train, min_samples=2):
    ### return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)
    # train the classifier and time the fit
    t0 = time()
    clf = clf.fit(X, Y)
    print("Decision Tree Training time: " + str(round(time()-t0, 3)) + " s")
    return clf
clf_40 = classify(features_train, labels_train, 40)
# predict
t0 = time()
labels_pred_40 = clf_40.predict(features_test)
print("Decision Tree Prediction time: " + str(round(time()-t0,3)) + " s")
# accuracy
acc_min_samples_split_40 = accuracy_score(labels_test, labels_pred_40)
print("Decision Tree Predicted labels: " + str(len(labels_pred_40)))
print("Decision Tree Accuracy: " + str(accuracy_score(labels_test, labels_pred_40)))
#########################################################
Speeding it up
Number of features in the data:
features_train, features_test, labels_train, labels_test = preprocess()
len(features_train[0])
As instructed by the Udacity mini-project: "go into tools/email_preprocess.py, and find the line of code that looks like this: selector = SelectPercentile(f_classif, percentile=10). Change percentile from 10 to 1."
Editing the module would require IPython to reload it, and the code above would otherwise keep producing the same result, so preprocess() was modified to take the percentile as an argument instead.
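A sketch of that change (the real tools/email_preprocess.py also handles TF-IDF vectorization and the train/test split; only the selection step is shown here, and select_features is a made-up name for it):
from sklearn.feature_selection import SelectPercentile, f_classif
def select_features(features_train, features_test, labels_train, percentile=10):
    # Keep only the top `percentile` percent of features, ranked by the
    # ANOVA F-value between each feature and the labels (f_classif).
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(features_train, labels_train)
    return selector.transform(features_train), selector.transform(features_test)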
from email_preprocess import preprocess
features_train, features_test, labels_train, labels_test = preprocess(percentile=1) # now percentile = 1
len(features_train[0])
Calculating the accuracy again with the selector percentile set to 1:
#!/usr/bin/python
"""
This is the code to accompany the Lesson 2 (SVM) mini-project.
Use a SVM to identify emails from the Enron corpus by their authors:
Sara has label 0
Chris has label 1
"""
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn import tree
from sklearn.metrics import accuracy_score
### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(percentile=1)
#########################################################
### your code goes here ###
def classify(features_train, labels_train, min_samples=2):
    ### return a trained decision tree classifier
    X = features_train
    Y = labels_train
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples)
    # train the classifier and time the fit
    t0 = time()
    clf = clf.fit(X, Y)
    print("Decision Tree Training time: " + str(round(time()-t0, 3)) + " s")
    return clf
clf_40 = classify(features_train, labels_train, 40)
# predict
t0 = time()
labels_pred_40 = clf_40.predict(features_test)
print("Decision Tree Prediction time: " + str(round(time()-t0,3)) + " s")
# accuracy
acc_min_samples_split_40 = accuracy_score(labels_test, labels_pred_40)
print("Decision Tree Predicted labels: " + str(len(labels_pred_40)))
print("Decision Tree Accuracy: " + str(accuracy_score(labels_test, labels_pred_40)))
#########################################################
In conclusion: when the number of features was reduced, the accuracy dropped as well.